RECompiler.java example

Explorer
damp.ekeko.snippets-master
- damp.ekeko.snippets.plugin
  - src
    - damp
      - ekeko
        snippets
        BoundDirective.java
        DirectiveOperandBinding.java
        EkekoSnippetsPlugin.java
        ExtractedSnippet.java
        NaiveASTFlattener.java
        OperatorOperandBinding.java
        SnippetBaseListener.java
        SnippetBaseVisitor.java
        SnippetExtractor.java
        SnippetLexer.java
        SnippetListener.java
        SnippetParser.java
        SnippetVisitor.java
        data
        SnippetOperator.java
        TemplateGroup.java
        geneticsearch
        PartialJavaProjectModel.java
        gui
        BoundDirectivesEditorDialog.java
        BoundDirectivesViewer.java
        ChartCanvas.java
        ClojureFileEditorInput.java
        DirectiveOperandBindingEditingSupport.java
        DirectiveOperandBindingLabelProviderValue.java
        DirectiveSelectionDialog.java
        IntendedResultsEditor.java
        IntendedResultsEditorCommandHandler.java
        IntendedResultsEditorInput.java
        IntendedResultsEditorPersistableElementFactory.java
        MutationHistoryDialog.java
        OperandBindingLabelProviderDescription.java
        OperatorOperandBindingEditingSupport.java
        OperatorOperandBindingLabelProviderValue.java
        OperatorOperandsView.java
        OperatorOperandsViewer.java
        OperatorTreeContentProvider.java
        OperatorTreeLabelProvider.java
        PopulationInspectorDialog.java
        QueryInspectorDialog.java
        RecommendationEditor.java
        RecommendationEditorCommandHandler.java
        RecommendationEditorInput.java
        RecommendationEditorPersistableElementFactory.java
        RewritesTemplateEditor.java
        SubjectsTemplateEditor.java
        TemplateCodeGenerator.java
        TemplateEditor.java
        TemplateEditorActionBarContributor.java
        TemplateEditorCommandHandler.java
        TemplateEditorInput.java
        TemplateEditorPersistableElementFactory.java
        TemplateGroupNodeSelectionDialog.java
        TemplateGroupTemplateElement.java
        TemplateGroupViewer.java
        TemplateGroupViewerNodeDoubleClickListener.java
        TemplateGroupViewerNodeSelectionEvent.java
        TemplateGroupViewerNodeSelectionListener.java
        TemplatePrettyPrinter.java
        TemplateTreeContentProvider.java
        TemplateTreeLabelProviders.java
        TransformationEditor.java
        TransformationEditorActionBarContributor.java
        TransformationEditorCommandHandler.java
        TransformationEditorInput.java
        TransformationEditorPersistableElementFactory.java
        TransformationOverviewEditor.java
    - ec
      - util
        MersenneTwister.java
- damp.ekeko.snippets.plugin.test
  - resources
  - src
    - test
      - damp
        ekeko
        snippets
        EkekoSnippetsTest.java
        experiments
        GeneticSearchTest.java
package org.apache.regexp;

/*
 * ====================================================================
 * 
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 1999 The Apache Software Foundation.  All rights 
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer. 
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution, if
 *    any, must include the following acknowlegement:  
 *       "This product includes software developed by the 
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowlegement may appear in the software itself,
 *    if and wherever such third-party acknowlegements normally appear.
 *
 * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
 *    Foundation" must not be used to endorse or promote products derived
 *    from this software without prior written permission. For written 
 *    permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache"
 *    nor may "Apache" appear in their names without prior written
 *    permission of the Apache Group.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 *
 */ 

import org.apache.regexp.RE;
import java.util.Hashtable;

/**
 * A regular expression compiler class.  This class compiles a pattern string into a
 * regular expression program interpretable by the RE evaluator class.  The 'recompile'
 * command line tool uses this compiler to pre-compile regular expressions for use
 * with RE.  For a description of the syntax accepted by RECompiler and what you can
 * do with regular expressions, see the documentation for the RE matcher class.
 *
 * @see RE
 * @see recompile
 *
 * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
 * @version $Id: RECompiler.java,v 1.1 2004/04/10 00:17:47 guehene Exp $
 */
public class RECompiler
{
    // The compiled program
    char[] instruction;                                 // The compiled RE 'program' instruction buffer
    int lenInstruction;                                 // The amount of the program buffer currently in use

    // Input state for compiling regular expression
    String pattern;                                     // Input string
    int len;                                            // Length of the pattern string
    int idx;                                            // Current input index into ac
    int parens;                                         // Total number of paren pairs

    // Node flags
    static final int NODE_NORMAL   = 0;                 // No flags (nothing special)
    static final int NODE_NULLABLE = 1;                 // True if node is potentially null
    static final int NODE_TOPLEVEL = 2;                 // True if top level expr

    // Special types of 'escapes'
    static final char ESC_MASK     = 0xfff0;            // Escape complexity mask
    static final char ESC_BACKREF  = 0xffff;            // Escape is really a backreference
    static final char ESC_COMPLEX  = 0xfffe;            // Escape isn't really a true character
    static final char ESC_CLASS    = 0xfffd;            // Escape represents a whole class of characters

    // {m,n} stacks
    static final int maxBrackets = 10;                  // Maximum number of bracket pairs
    static int brackets = 0;                            // Number of bracket sets
    static int[] bracketStart = null;                   // Starting point
    static int[] bracketEnd = null;                     // Ending point
    static int[] bracketMin = null;                     // Minimum number of matches
    static int[] bracketOpt = null;                     // Additional optional matches
    static final int bracketUnbounded = -1;             // Unbounded value
    static final int bracketFinished = -2;              // Unbounded value

    // Lookup table for POSIX character class names


    /**
     * @associates Character 
     */
    static Hashtable hashPOSIX = new Hashtable();
    static
    {
        hashPOSIX.put("alnum",     new Character(RE.POSIX_CLASS_ALNUM));
        hashPOSIX.put("alpha",     new Character(RE.POSIX_CLASS_ALPHA));
        hashPOSIX.put("blank",     new Character(RE.POSIX_CLASS_BLANK));
        hashPOSIX.put("cntrl",     new Character(RE.POSIX_CLASS_CNTRL));
        hashPOSIX.put("digit",     new Character(RE.POSIX_CLASS_DIGIT));
        hashPOSIX.put("graph",     new Character(RE.POSIX_CLASS_GRAPH));
        hashPOSIX.put("lower",     new Character(RE.POSIX_CLASS_LOWER));
        hashPOSIX.put("print",     new Character(RE.POSIX_CLASS_PRINT));
        hashPOSIX.put("punct",     new Character(RE.POSIX_CLASS_PUNCT));
        hashPOSIX.put("space",     new Character(RE.POSIX_CLASS_SPACE));
        hashPOSIX.put("upper",     new Character(RE.POSIX_CLASS_UPPER));
        hashPOSIX.put("xdigit",    new Character(RE.POSIX_CLASS_XDIGIT));
        hashPOSIX.put("javastart", new Character(RE.POSIX_CLASS_JSTART));
        hashPOSIX.put("javapart",  new Character(RE.POSIX_CLASS_JPART));
    }

    /**
     * Constructor.  Creates (initially empty) storage for a regular expression program.
     */
    public RECompiler()
    {
        // Start off with a generous, yet reasonable, initial size
        instruction = new char[128];
        lenInstruction = 0;
    }

    /**
     * Ensures that n more characters can fit in the program buffer.
     * If n more can't fit, then the size is doubled until it can.
     * @param n Number of additional characters to ensure will fit.
     */
    void ensure(int n)
    {
        // Get current program length
        int curlen = instruction.length;

        // If the current length + n more is too much
        if (lenInstruction + n >= curlen)
        {
            // Double the size of the program array until n more will fit
            while (lenInstruction + n >= curlen)
            {
                curlen *= 2;
            }

            // Allocate new program array and move data into it
            char[] newInstruction = new char[curlen];
            System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction);
            instruction = newInstruction;
        }
    }

    /**
     * Emit a single character into the program stream.
     * @param c Character to add
     */
    void emit(char c)
    {
        // Make room for character
        ensure(1);

        // Add character
        instruction[lenInstruction++] = c;
    }

    /**
     * Inserts a node with a given opcode and opdata at insertAt.  The node relative next
     * pointer is initialized to 0.
     * @param opcode Opcode for new node
     * @param opdata Opdata for new node (only the low 16 bits are currently used)
     * @param insertAt Index at which to insert the new node in the program
     */
    void nodeInsert(char opcode, int opdata, int insertAt)
    {
        // Make room for a new node
        ensure(RE.nodeSize);

        // Move everything from insertAt to the end down nodeSize elements
        System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt);
        instruction[insertAt + RE.offsetOpcode] = opcode;
        instruction[insertAt + RE.offsetOpdata] = (char)opdata;
        instruction[insertAt + RE.offsetNext] = 0;
        lenInstruction += RE.nodeSize;
    }

    /**
     * Appends a node to the end of a node chain
     * @param node Start of node chain to traverse
     * @param pointTo Node to have the tail of the chain point to
     */
    void setNextOfEnd(int node, int pointTo)
    {
        // Traverse the chain until the next offset is 0
        int next;
        while ((next = instruction[node + RE.offsetNext]) != 0)
        {
            node += next;
        }

        // Point the last node in the chain to pointTo.
        instruction[node + RE.offsetNext] = (char)(short)(pointTo - node);
    }

    /**
     * Adds a new node
     * @param opcode Opcode for node
     * @param opdata Opdata for node (only the low 16 bits are currently used)
     * @return Index of new node in program
     */
    int node(char opcode, int opdata)
    {
        // Make room for a new node
        ensure(RE.nodeSize);

        // Add new node at end
        instruction[lenInstruction + RE.offsetOpcode] = opcode;
        instruction[lenInstruction + RE.offsetOpdata] = (char)opdata;
        instruction[lenInstruction + RE.offsetNext] = 0;
        lenInstruction += RE.nodeSize;

        // Return index of new node
        return lenInstruction - RE.nodeSize;
    }


    /**
     * Throws a new internal error exception
     * @exception Error Thrown in the event of an internal error.
     */
    void internalError() throws Error
    {
        throw new Error("Internal error!");
    }

    /**
     * Throws a new syntax error exception
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    void syntaxError(String s) throws RESyntaxException
    {
        throw new RESyntaxException(s);
    }

    /**
     * Allocate storage for brackets only as needed
     */
    void allocBrackets()
    {
        // Allocate bracket stacks if not already done
        if (bracketStart == null)
        {
            // Allocate storage
            bracketStart = new int[maxBrackets];
            bracketEnd   = new int[maxBrackets];
            bracketMin   = new int[maxBrackets];
            bracketOpt   = new int[maxBrackets];

            // Initialize to invalid values
            for (int i = 0; i < maxBrackets; i++)
            {
                bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
            }
        }
    }

    /**
     * Match bracket {m,n} expression put results in bracket member variables
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    void bracket() throws RESyntaxException
    {
        // Current character must be a '{'
        if (idx >= len || pattern.charAt(idx++) != '{')
        {
            internalError();
        }

        // Next char must be a digit
        if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
        {
            syntaxError("Expected digit");
        }

        // Get min ('m' of {m,n}) number
        StringBuffer number = new StringBuffer();
        while (idx < len && Character.isDigit(pattern.charAt(idx)))
        {
            number.append(pattern.charAt(idx++));
        }
        try
        {
            bracketMin[brackets] = Integer.parseInt(number.toString());
        }
        catch (NumberFormatException e)
        {
            syntaxError("Expected valid number");
        }

        // If out of input, fail
        if (idx >= len)
        {
            syntaxError("Expected comma or right bracket");
        }

        // If end of expr, optional limit is 0
        if (pattern.charAt(idx) == '}')
        {
            idx++;
            bracketOpt[brackets] = 0;
            return;
        }

        // Must have at least {m,} and maybe {m,n}.
        if (idx >= len || pattern.charAt(idx++) != ',')
        {
            syntaxError("Expected comma");
        }

        // If out of input, fail
        if (idx >= len)
        {
            syntaxError("Expected comma or right bracket");
        }

        // If {m,} max is unlimited
        if (pattern.charAt(idx) == '}')
        {
            idx++;
            bracketOpt[brackets] = bracketUnbounded;
            return;
        }

        // Next char must be a digit
        if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
        {
            syntaxError("Expected digit");
        }

        // Get max number
        number.setLength(0);
        while (idx < len && Character.isDigit(pattern.charAt(idx)))
        {
            number.append(pattern.charAt(idx++));
        }
        try
        {
            bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets];
        }
        catch (NumberFormatException e)
        {
            syntaxError("Expected valid number");
        }

        // Optional repetitions must be > 0
        if (bracketOpt[brackets] <= 0)
        {
            syntaxError("Bad range");
        }

        // Must have close brace
        if (idx >= len || pattern.charAt(idx++) != '}')
        {
            syntaxError("Missing close brace");
        }
    }

    /**
     * Match an escape sequence.  Handles quoted chars and octal escapes as well
     * as normal escape characters.  Always advances the input stream by the
     * right amount. This code "understands" the subtle difference between an
     * octal escape and a backref.  You can access the type of ESC_CLASS or
     * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
     * @return ESC_* code or character if simple escape
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    char escape() throws RESyntaxException
    {
        // "Shouldn't" happen
        if (pattern.charAt(idx) != '\\')
        {
            internalError();
        }

        // Escape shouldn't occur as last character in string!
        if (idx + 1 == len)
        {
            syntaxError("Escape terminates string");
        }

        // Switch on character after backslash
        idx += 2;
        char escapeChar = pattern.charAt(idx - 1);
        switch (escapeChar)
        {
            case RE.E_BOUND:
            case RE.E_NBOUND:
                return ESC_COMPLEX;

            case RE.E_ALNUM:
            case RE.E_NALNUM:
            case RE.E_SPACE:
            case RE.E_NSPACE:
            case RE.E_DIGIT:
            case RE.E_NDIGIT:
                return ESC_CLASS;

            case 'u':
            case 'x':
                {
                    // Exact required hex digits for escape type
                    int hexDigits = (escapeChar == 'u' ? 4 : 2);

                    // Parse up to hexDigits characters from input
                    int val = 0;
                    for ( ; idx < len && hexDigits-- > 0; idx++)
                    {
                        // Get char
                        char c = pattern.charAt(idx);

                        // If it's a hexadecimal digit (0-9)
                        if (c >= '0' && c <= '9')
                        {
                            // Compute new value
                            val = (val << 4) + c - '0';
                        }
                        else
                        {
                            // If it's a hexadecimal letter (a-f)
                            c = Character.toLowerCase(c);
                            if (c >= 'a' && c <= 'f')
                            {
                                // Compute new value 
                                val = (val << 4) + (c - 'a') + 10;
                            }
                            else
                            {
                                // If it's not a valid digit or hex letter, the escape must be invalid
                                // because hexDigits of input have not been absorbed yet.
                                syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar);
                            }
                        }
                    }
                    return (char)val;
                }

            case 't':
                return '\t';

            case 'n':
                return '\n';

            case 'r':
                return '\r';

            case 'f':
                return '\f';

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':

                // An octal escape starts with a 0 or has two digits in a row
                if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0')
                {
                    // Handle \nnn octal escapes
                    int val = escapeChar - '0';
                    if (idx < len && Character.isDigit(pattern.charAt(idx)))
                    {
                        val = ((val << 3) + (pattern.charAt(idx++) - '0'));
                        if (idx < len && Character.isDigit(pattern.charAt(idx)))
                        {
                            val = ((val << 3) + (pattern.charAt(idx++) - '0'));
                        }
                    }
                    return (char)val;
                }

                // It's actually a backreference (\[1-9]), not an escape
                return ESC_BACKREF;

            default:

                // Simple quoting of a character
                return escapeChar;
        }
    }

    /**
     * Compile a character class
     * @return Index of class node
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int characterClass() throws RESyntaxException
    {
        // Check for bad calling or empty class
        if (pattern.charAt(idx) != '[')
        {
            internalError();
        }

        // Check for unterminated or empty class
        if ((idx + 1) >= len || pattern.charAt(++idx) == ']')
        {
            syntaxError("Empty or unterminated class");
        }

        // Check for POSIX character class
        if (idx < len && pattern.charAt(idx) == ':')
        {
            // Skip colon
            idx++;

            // POSIX character classes are denoted with lowercase ASCII strings
            int idxStart = idx;
            while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z')
            {
                idx++;
            }
            
            // Should be a ":]" to terminate the POSIX character class
            if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']')
            {
                // Get character class
                String charClass = pattern.substring(idxStart, idx);

                // Select the POSIX class id
                Character i = (Character)hashPOSIX.get(charClass);
                if (i != null)
                {
                    // Move past colon and right bracket
                    idx += 2;

                    // Return new POSIX character class node
                    return node(RE.OP_POSIXCLASS, i.charValue());
                }
                syntaxError("Invalid POSIX character class '" + charClass + "'");
            }
            syntaxError("Invalid POSIX character class syntax");
        }

        // Try to build a class.  Create OP_ANYOF node
        int ret = node(RE.OP_ANYOF, 0);

        // Parse class declaration
        char CHAR_INVALID = Character.MAX_VALUE;
        char last = CHAR_INVALID;
        char simpleChar = 0;
        boolean include = true;
        boolean definingRange = false;
        int idxFirst = idx;
        char rangeStart = Character.MIN_VALUE;
        char rangeEnd;
        RERange range = new RERange();
        while (idx < len && pattern.charAt(idx) != ']')
        {

            switchOnCharacter:

            // Switch on character
            switch (pattern.charAt(idx))
            {
                case '^':
                    include = !include;
                    if (idx == idxFirst)
                    {
                        range.include(Character.MIN_VALUE, Character.MAX_VALUE, true);
                    }
                    idx++;
                    continue;

                case '\\':
                {
                    // Escape always advances the stream
                    char c;
                    switch (c = escape ())
                    {
                        case ESC_COMPLEX:
                        case ESC_BACKREF:

                            // Word boundaries and backrefs not allowed in a character class!
                            syntaxError("Bad character class");

                        case ESC_CLASS:

                            // Classes can't be an endpoint of a range
                            if (definingRange)
                            {
                                syntaxError("Bad character class");
                            }

                            // Handle specific type of class (some are ok)
                            switch (pattern.charAt(idx - 1))
                            {
                                case RE.E_NSPACE:
                                case RE.E_NDIGIT:
                                case RE.E_NALNUM:
                                    syntaxError("Bad character class");

                                case RE.E_SPACE:
                                    range.include('\t', include);
                                    range.include('\r', include);
                                    range.include('\f', include);
                                    range.include('\n', include);
                                    range.include('\b', include);
                                    range.include(' ', include);
                                    break;

                                case RE.E_ALNUM:
                                    range.include('a', 'z', include);
                                    range.include('A', 'Z', include);
                                    range.include('_', include);

                                    // Fall through!

                                case RE.E_DIGIT:
                                    range.include('0', '9', include);
                                    break;
                            }

                            // Make last char invalid (can't be a range start)
                            last = CHAR_INVALID;
                            break;

                        default:

                            // Escape is simple so treat as a simple char
                            simpleChar = c;
                            break switchOnCharacter;
                    }
                }
                continue;

                case '-':

                    // Start a range if one isn't already started
                    if (definingRange)
                    {
                        syntaxError("Bad class range");
                    }
                    definingRange = true;

                    // If no last character, start of range is 0
                    rangeStart = (last == CHAR_INVALID ? 0 : last);

                    // Premature end of range. define up to Character.MAX_VALUE
                    if ((idx + 1) < len && pattern.charAt(++idx) == ']')
                    {
                        simpleChar = Character.MAX_VALUE;
                        break;
                    }
                    continue;

                default:
                    simpleChar = pattern.charAt(idx++);
                    break;
            }

            // Handle simple character simpleChar
            if (definingRange)
            {
                // if we are defining a range make it now
                rangeEnd = simpleChar;

                // Actually create a range if the range is ok
                if (rangeStart >= rangeEnd)
                {
                    syntaxError("Bad character class");
                }
                range.include(rangeStart, rangeEnd, include);

                // We are done defining the range
                last = CHAR_INVALID;
                definingRange = false;
            }
            else
            {
                // If simple character and not start of range, include it
                if ((idx + 1) >= len || pattern.charAt(idx + 1) != '-')
                {
                    range.include(simpleChar, include);
                }
                last = simpleChar;
            }
        }

        // Shouldn't be out of input
        if (idx == len)
        {
            syntaxError("Unterminated character class");
        }

        // Absorb the ']' end of class marker
        idx++;

        // Emit character class definition
        instruction[ret + RE.offsetOpdata] = (char)range.num;
        for (int i = 0; i < range.num; i++)
        {
            emit((char)range.minRange[i]);
            emit((char)range.maxRange[i]);
        }
        return ret;
    }

    /**
     * Absorb an atomic character string.  This method is a little tricky because
     * it can un-include the last character of string if a closure operator follows.
     * This is correct because *+? have higher precedence than concatentation (thus
     * ABC* means AB(C*) and NOT (ABC)*).
     * @return Index of new atom node
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int atom() throws RESyntaxException
    {
        // Create a string node
        int ret = node(RE.OP_ATOM, 0);

        // Length of atom
        int lenAtom = 0;

        // Loop while we've got input

        atomLoop:

        while (idx < len)
        {
            // Is there a next char?
            if ((idx + 1) < len)
            {
                char c = pattern.charAt(idx + 1);

                // If the next 'char' is an escape, look past the whole escape
                if (pattern.charAt(idx) == '\\')
                {
                    int idxEscape = idx;
                    escape();
                    if (idx < len)
                    {
                        c = pattern.charAt(idx);
                    }
                    idx = idxEscape;
                }

                // Switch on next char
                switch (c)
                {
                    case '{':
                    case '?':
                    case '*':
                    case '+':

                        // If the next character is a closure operator and our atom is non-empty, the
                        // current character should bind to the closure operator rather than the atom
                        if (lenAtom != 0)
                        {
                            break atomLoop;
                        }
                }
            }

            // Switch on current char
            switch (pattern.charAt(idx))
            {
                case ']':
                case '^':
                case '$':
                case '.':
                case '[':
                case '(':
                case ')':
                case '|':
                    break atomLoop;

                case '{':
                case '?':
                case '*':
                case '+':

                    // We should have an atom by now
                    if (lenAtom == 0)
                    {
                        // No atom before closure
                        syntaxError("Missing operand to closure");
                    }
                    break atomLoop;

                case '\\':

                    {
                        // Get the escaped character (advances input automatically)
                        int idxBeforeEscape = idx;
                        char c = escape();

                        // Check if it's a simple escape (as opposed to, say, a backreference)
                        if ((c & ESC_MASK) == ESC_MASK)
                        {
                            // Not a simple escape, so backup to where we were before the escape.
                            idx = idxBeforeEscape;
                            break atomLoop;
                        }

                        // Add escaped char to atom
                        emit(c);
                        lenAtom++;
                    }
                    break;

                default:

                    // Add normal character to atom
                    emit(pattern.charAt(idx++));
                    lenAtom++;
                    break;
            }
        }

        // This "shouldn't" happen
        if (lenAtom == 0)
        {
            internalError();
        }

        // Emit the atom length into the program
        instruction[ret + RE.offsetOpdata] = (char)lenAtom;
        return ret;
    }

    /**
     * Match a terminal node.
     * @param flags Flags
     * @return Index of terminal node (closeable)
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int terminal(int[] flags) throws RESyntaxException
    {
        switch (pattern.charAt(idx))
        {
        case RE.OP_EOL:
        case RE.OP_BOL:
        case RE.OP_ANY:
            return node(pattern.charAt(idx++), 0);

        case '[':
            return characterClass();

        case '(':
            return expr(flags);

        case ')':
            syntaxError("Unexpected close paren");

        case '|':
            internalError();

        case ']':
            syntaxError("Mismatched class");

        case 0:
            syntaxError("Unexpected end of input");

        case '?':
        case '+':
        case '{':
        case '*':
            syntaxError("Missing operand to closure");

        case '\\':
            {
                // Don't forget, escape() advances the input stream!
                int idxBeforeEscape = idx;

                // Switch on escaped character
                switch (escape())
                {
                    case ESC_CLASS:
                    case ESC_COMPLEX:
                        flags[0] &= ~NODE_NULLABLE;
                        return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));

                    case ESC_BACKREF:
                        {
                            char backreference = (char)(pattern.charAt(idx - 1) - '0');
                            if (parens <= backreference)
                            {
                                syntaxError("Bad backreference");
                            }
                            flags[0] |= NODE_NULLABLE;
                            return node(RE.OP_BACKREF, backreference);
                        }

                    default:

                        // We had a simple escape and we want to have it end up in
                        // an atom, so we back up and fall though to the default handling
                        idx = idxBeforeEscape;
                        flags[0] &= ~NODE_NULLABLE;
                        break;
                }
            }
        }

        // Everything above either fails or returns.
        // If it wasn't one of the above, it must be the start of an atom.
        flags[0] &= ~NODE_NULLABLE;
        return atom();
    }

    /**
     * Compile a possibly closured terminal
     * @param flags Flags passed by reference
     * @return Index of closured node
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int closure(int[] flags) throws RESyntaxException
    {
        // Before terminal
        int idxBeforeTerminal = idx;

        // Values to pass by reference to terminal()
        int[] terminalFlags = { NODE_NORMAL };

        // Get terminal symbol
        int ret = terminal(terminalFlags);

        // Or in flags from terminal symbol
        flags[0] |= terminalFlags[0];

        // Advance input, set NODE_NULLABLE flag and do sanity checks
        if (idx >= len)
        {
            return ret;
        }
        boolean greedy = true;
        char closureType = pattern.charAt(idx);
        switch (closureType)
        {
            case '?':
            case '*':

                // The current node can be null
                flags[0] |= NODE_NULLABLE;

            case '+':

                // Eat closure character
                idx++;

            case '{':

                // Don't allow blantant stupidity
                int opcode = instruction[ret + RE.offsetOpcode];
                if (opcode == RE.OP_BOL || opcode == RE.OP_EOL)
                {
                    syntaxError("Bad closure operand");
                }
                if ((terminalFlags[0] & NODE_NULLABLE) != 0)
                {
                    syntaxError("Closure operand can't be nullable");
                }
                break;
        }

        // If the next character is a '?', make the closure non-greedy (reluctant)
        if (idx < len && pattern.charAt(idx) == '?')
        {
            idx++;
            greedy = false;
        }

        if (greedy)
        {
            // Actually do the closure now
            switch (closureType)
            {
                case '{':
                {
                    // We look for our bracket in the list
                    boolean found = false;
                    int i;
                    allocBrackets();
                    for (i = 0; i < brackets; i++)
                    {
                        if (bracketStart[i] == idx)
                        {
                            found = true;
                            break;
                        }
                    }
                    
                    // If its not in the list we parse the {m,n}
                    if (!found)
                    {
                        if (brackets >= maxBrackets)
                        {
                            syntaxError("Too many bracketed closures (limit is 10)");
                        }
                        bracketStart[brackets] = idx;
                        bracket();
                        bracketEnd[brackets] = idx;
                        i = brackets++;
                    }
                    
                    // If there's a min, rewind stream and reparse
                    if (--bracketMin[i] > 0)
                    {
                        // Rewind stream and run it through again
                        idx = idxBeforeTerminal;
                        break;
                    }
                    
                    // Do the right thing for maximum ({m,})
                    if (bracketOpt[i] == bracketFinished)
                    {
                        // Drop through now and closure expression.
                        // We are done with the {m,} expr, so skip rest
                        closureType = '*';
                        bracketOpt[i] = 0;
                        idx = bracketEnd[i];
                    }
                    else
                        if (bracketOpt[i] == bracketUnbounded)
                        {
                            idx = idxBeforeTerminal;
                            bracketOpt[i] = bracketFinished;
                            break;
                        }
                        else
                            if (bracketOpt[i]-- > 0)
                            {
                                // Drop through to optionally close and then 'play it again sam!'
                                idx = idxBeforeTerminal;
                                closureType = '?';
                            }
                            else
                            {
                                // We are done. skip the rest of {m,n} expr
                                idx = bracketEnd[i];
                                break;
                            }
                }
                
                // Fall through!
                
                case '?':
                case '*':
                    
                    if (!greedy)
                    {
                        break;
                    }
                    
                    if (closureType == '?')
                    {
                        // X? is compiled as (X|)
                        nodeInsert(RE.OP_BRANCH, 0, ret);                 // branch before X
                        setNextOfEnd(ret, node (RE.OP_BRANCH, 0));        // inserted branch to option
                        int nothing = node (RE.OP_NOTHING, 0);            // which is OP_NOTHING
                        setNextOfEnd(ret, nothing);                       // point (second) branch to OP_NOTHING
                        setNextOfEnd(ret + RE.nodeSize, nothing);         // point the end of X to OP_NOTHING node
                    }
                    
                    if (closureType == '*')
                    {
                        // X* is compiled as (X{gotoX}|)
                        nodeInsert(RE.OP_BRANCH, 0, ret);                         // branch before X
                        setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0));   // end of X points to an option
                        setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0));     // to goto
                        setNextOfEnd(ret + RE.nodeSize, ret);                     // the start again
                        setNextOfEnd(ret, node(RE.OP_BRANCH, 0));                 // the other option is
                        setNextOfEnd(ret, node(RE.OP_NOTHING, 0));                // OP_NOTHING
                    }
                    break;
                    
                case '+':
                {
                    // X+ is compiled as X({gotoX}|)
                    int branch;
                    branch = node(RE.OP_BRANCH, 0);                   // a new branch
                    setNextOfEnd(ret, branch);                        // is added to the end of X
                    setNextOfEnd(node(RE.OP_GOTO, 0), ret);           // one option is to go back to the start
                    setNextOfEnd(branch, node(RE.OP_BRANCH, 0));      // the other option
                    setNextOfEnd(ret, node(RE.OP_NOTHING, 0));        // is OP_NOTHING
                }
                break;
            }
        }
        else
        {
            // Add end after closured subexpr
            setNextOfEnd(ret, node(RE.OP_END, 0));

            // Actually do the closure now
            switch (closureType)
            {
                case '?':
                    nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
                    break;
                    
                case '*':       
                    nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
                    break;

                case '+':
                    nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
                    break;
            }

            // Point to the expr after the closure
            setNextOfEnd(ret, lenInstruction);          
        }
        return ret;
    }

    /**
     * Compile one branch of an or operator (implements concatenation)
     * @param flags Flags passed by reference
     * @return Pointer to branch node
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int branch(int[] flags) throws RESyntaxException
    {
        // Get each possibly closured piece and concat
        int node;
        int ret = node(RE.OP_BRANCH, 0);
        int chain = -1;
        int[] closureFlags = new int[1];
        boolean nullable = true;
        while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')')
        {
            // Get new node
            closureFlags[0] = NODE_NORMAL;
            node = closure(closureFlags);
            if (closureFlags[0] == NODE_NORMAL)
            {
                nullable = false;
            }

            // If there's a chain, append to the end
            if (chain != -1)
            {
                setNextOfEnd(chain, node);
            }

            // Chain starts at current
            chain = node;
        }

        // If we don't run loop, make a nothing node
        if (chain == -1)
        {
            node(RE.OP_NOTHING, 0);
        }

        // Set nullable flag for this branch
        if (nullable)
        {
            flags[0] |= NODE_NULLABLE;
        }
        return ret;
    }

    /**
     * Compile an expression with possible parens around it.  Paren matching
     * is done at this level so we can tie the branch tails together.
     * @param flags Flag value passed by reference
     * @return Node index of expression in instruction array
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     */
    int expr(int[] flags) throws RESyntaxException
    {
        // Create open paren node unless we were called from the top level (which has no parens)
        boolean paren = false;
        int ret = -1;
        int closeParens = parens;
        if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(')
        {
            idx++;
            paren = true;
            ret = node(RE.OP_OPEN, parens++);
        }
        flags[0] &= ~NODE_TOPLEVEL;

        // Create a branch node
        int branch = branch(flags);
        if (ret == -1)
        {
            ret = branch;
        }
        else
        {
            setNextOfEnd(ret, branch);
        }

        // Loop through branches
        while (idx < len && pattern.charAt(idx) == '|')
        {
            idx++;
            branch = branch(flags);
            setNextOfEnd(ret, branch);
        }

        // Create an ending node (either a close paren or an OP_END)
        int end;
        if (paren)
        {
            if (idx < len && pattern.charAt(idx) == ')')
            {
                idx++;
            }
            else
            {
                syntaxError("Missing close paren");
            }
            end = node(RE.OP_CLOSE, closeParens);
        }
        else
        {
            end = node(RE.OP_END, 0);
        }

        // Append the ending node to the ret nodelist
        setNextOfEnd(ret, end);

        // Hook the ends of each branch to the end node
        for (int next = -1, i = ret; next != 0; next = instruction[i + RE.offsetNext], i += next)
        {
            // If branch, make the end of the branch's operand chain point to the end node.
            if (instruction[i + RE.offsetOpcode] == RE.OP_BRANCH)
            {
                setNextOfEnd(i + RE.nodeSize, end);
            }
        }

        // Return the node list
        return ret;
    }

    /**
     * Compiles a regular expression pattern into a program runnable by the pattern
     * matcher class 'RE'.
     * @param pattern Regular expression pattern to compile (see RECompiler class
     * for details).
     * @return A compiled regular expression program.
     * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
     * @see RECompiler
     * @see RE
     */
    public REProgram compile(String pattern) throws RESyntaxException
    {
        // Initialize variables for compilation
        this.pattern = pattern;                         // Save pattern in instance variable
        len = pattern.length();                         // Precompute pattern length for speed
        idx = 0;                                        // Set parsing index to the first character
        lenInstruction = 0;                             // Set emitted instruction count to zero
        parens = 1;                                     // Set paren level to 1 (the implicit outer parens)
        brackets = 0;                                   // No bracketed closures yet

        // Initialize pass by reference flags value
        int[] flags = { NODE_TOPLEVEL };

        // Parse expression
        expr(flags);

        // Should be at end of input
        if (idx != len)
        {
            if (pattern.charAt(idx) == ')')
            {
                syntaxError("Unmatched close paren");
            }
            syntaxError("Unexpected input remains");
        }

        // Return the result
        char[] ins = new char[lenInstruction];
        System.arraycopy(instruction, 0, ins, 0, lenInstruction);
        return new REProgram(ins);
    }

    /**
     * Local, nested class for maintaining character ranges for character classes.
     */
    class RERange
    {
        int size = 16;                      // Capacity of current range arrays
        int[] minRange = new int[size];     // Range minima
        int[] maxRange = new int[size];     // Range maxima
        int num = 0;                        // Number of range array elements in use
        
        /**
         * Deletes the range at a given index from the range lists
         * @param index Index of range to delete from minRange and maxRange arrays.
         */
        void delete(int index)
        {
            // Return if no elements left or index is out of range
            if (num == 0 || index >= num)
            {
                return;
            }
            
            // Move elements down
            while (index++ < num)
            {
                if (index - 1 >= 0)
                {
                    minRange[index-1] = minRange[index];
                    maxRange[index-1] = maxRange[index];
                }
            }
            
            // One less element now
            num--;
        }
        
        /**
         * Merges a range into the range list, coalescing ranges if possible.
         * @param min Minimum end of range
         * @param max Maximum end of range
         */
        void merge(int min, int max)
        {
            // Loop through ranges
            for (int i = 0; i < num; i++)
            {
                // Min-max is subsumed by minRange[i]-maxRange[i]
                if (min >= minRange[i] && max <= maxRange[i])
                {
                    return;
                }
                
                // Min-max subsumes minRange[i]-maxRange[i]
                else if (min <= minRange[i] && max >= maxRange[i])
                {
                    delete(i);
                    merge(min, max);
                    return;
                }
                
                // Min is in the range, but max is outside
                else if (min >= minRange[i] && min <= maxRange[i])
                {
                    delete(i);
                    min = minRange[i];
                    merge(min, max);
                    return;
                }
                
                // Max is in the range, but min is outside
                else if (max >= minRange[i] && max <= maxRange[i])
                {
                    delete(i);
                    max = maxRange[i];
                    merge(min, max);
                    return;
                }
            }
            
            // Must not overlap any other ranges
            if (num >= size)
            {
                size *= 2;
                int[] newMin = new int[size];
                int[] newMax = new int[size];
                System.arraycopy(minRange, 0, newMin, 0, num);
                System.arraycopy(maxRange, 0, newMax, 0, num);
                minRange = newMin;
                maxRange = newMax;
            }
            minRange[num] = min;
            maxRange[num] = max;
            num++;
        }
        
        /**
         * Removes a range by deleting or shrinking all other ranges
         * @param min Minimum end of range
         * @param max Maximum end of range
         */
        void remove(int min, int max)
        {
            // Loop through ranges
            for (int i = 0; i < num; i++)
            {
                // minRange[i]-maxRange[i] is subsumed by min-max
                if (minRange[i] >= min && maxRange[i] <= max)
                {
                    delete(i);
                    i--;
                    return;
                }
                
                // min-max is subsumed by minRange[i]-maxRange[i]
                else if (min >= minRange[i] && max <= maxRange[i])
                {
                    int minr = minRange[i];
                    int maxr = maxRange[i];
                    delete(i);
                    if (minr < min - 1)
                    {
                        merge(minr, min - 1);
                    }
                    if (max + 1 < maxr)
                    {
                        merge(max + 1, maxr);
                    }
                    return;
                }
                
                // minRange is in the range, but maxRange is outside
                else if (minRange[i] >= min && minRange[i] <= max)
                {
                    minRange[i] = max + 1;
                    return;
                }
                
                // maxRange is in the range, but minRange is outside
                else if (maxRange[i] >= min && maxRange[i] <= max)
                {
                    maxRange[i] = min - 1;
                    return;
                }
            }
        }
        
        /**
         * Includes (or excludes) the range from min to max, inclusive.
         * @param min Minimum end of range
         * @param max Maximum end of range
         * @param include True if range should be included.  False otherwise.
         */
        void include(int min, int max, boolean include)
        {
            if (include)
            {
                merge(min, max);
            }
            else
            {
                remove(min, max);
            }
        }
        
        /**
         * Includes a range with the same min and max
         * @param minmax Minimum and maximum end of range (inclusive)
         * @param include True if range should be included.  False otherwise.
         */
        void include(char minmax, boolean include)
        {
            include(minmax, minmax, include);
        }
    }
}